import os # Import the 'os' module
import numpy as np # NumPy for numerical operations
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px # Plotly Express for interactive visualizations
import matplotlib.pyplot as plt # Graphing
import seaborn as sns # Graphing
import plotly.graph_objects as go # Library used for modified boxplot
from empiricaldist import Pmf, Cdf # Import Pmf and Cdf classes from the 'empiricaldist' module
from datetime import timedelta # Import the 'timedelta' class from the 'datetime' module
from sklearn.model_selection import train_test_split # Import 'train_test_split' from 'sklearn.model_selection', which is used to split a dataset into training and testing subsets
from sklearn.linear_model import LinearRegression # Import the 'LinearRegression' class from 'sklearn.linear_model', which is used to create a linear regression model
from sklearn.tree import DecisionTreeRegressor # Import the DecisionTreeRegressor class from the scikit-learn library
from sklearn.ensemble import RandomForestRegressor # Import the RandomForestRegressor class from the scikit-learn library
from sklearn.ensemble import GradientBoostingRegressor # Import the GradientBoostingRegressor class from the scikit-learn library
from sklearn.metrics import mean_squared_error, mean_absolute_error # Import the 'mean_squared_error' and 'mean_absolute_error' functions from 'sklearn.metrics', which are used for evaluating regression models
from matplotlib.patches import Patch # Import the Patch class from the matplotlib.patches module
from copy import copy # Import the copy function from the copy module
# Load the US AQI dataset into the 'dataset' DataFrame.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
dataset = pd.read_csv('C:/Users/Mohammad Navid/Desktop/US_AQI.csv')

# 'Date' is compared as text below; this assumes ISO 'YYYY-MM-DD' strings, for which
# lexicographic order equals chronological order.
# Each slice is materialised with .copy() so that the cells which later add/drop
# columns operate on independent frames instead of on slices of 'dataset'
# (writing into a slice is what triggers pandas' SettingWithCopyWarning).

# Data from May 2020 to May 2021 (both bounds inclusive)
filtered_data_1 = dataset[(dataset['Date'] >= '2020-05-01') & (dataset['Date'] <= '2021-05-31')].copy()
# Data from June 2021 to May 2022 (both bounds inclusive)
filtered_data_2 = dataset[(dataset['Date'] >= '2021-06-01') & (dataset['Date'] <= '2022-05-31')].copy()
# Data from May 2020 to May 2022 (both bounds inclusive)
full_data = dataset[(dataset['Date'] >= '2020-05-01') & (dataset['Date'] <= '2022-05-31')].copy()
# Show full data table
full_data
| Unnamed: 0 | CBSA Code | Date | AQI | Category | Defining Parameter | Number of Sites Reporting | city_ascii | state_id | state_name | lat | lng | population | density | timezone | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 10140 | 2022-01-01 | 21 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | America/Los_Angeles |
| 1 | 1 | 10140 | 2022-01-02 | 12 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | America/Los_Angeles |
| 2 | 2 | 10140 | 2022-01-03 | 18 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | America/Los_Angeles |
| 3 | 3 | 10140 | 2022-01-04 | 19 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | America/Los_Angeles |
| 4 | 4 | 10140 | 2022-01-05 | 17 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | America/Los_Angeles |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 359311 | 365463 | 49740 | 2020-12-27 | 32 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | America/Phoenix |
| 359312 | 365464 | 49740 | 2020-12-28 | 44 | Good | PM10 | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | America/Phoenix |
| 359313 | 365465 | 49740 | 2020-12-29 | 38 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | America/Phoenix |
| 359314 | 365466 | 49740 | 2020-12-30 | 36 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | America/Phoenix |
| 359315 | 365467 | 49740 | 2020-12-31 | 35 | Good | PM2.5 | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | America/Phoenix |
305077 rows × 15 columns
# Prepare filtered_data_1 (May 2020 - May 2021) for analysis.
# Every step returns a new DataFrame (no column assignment or inplace drop on a
# slice), so this cell does not raise SettingWithCopyWarning.

# Remove 'District of Columbia' and 'Puerto Rico' rows. Done first: the columns
# built below are computed row by row, so the final result is unaffected.
filtered_data_1 = filtered_data_1[~filtered_data_1['state_name'].isin(['District of Columbia', 'Puerto Rico'])]

# Add one column per pollutant: it holds the row's AQI when that pollutant is the
# row's 'Defining Parameter' and 0 otherwise. The values are then stored as
# strings (e.g. '43.0', '0.0'), exactly as before.
# Finally drop the columns that are not needed any more.
filtered_data_1 = filtered_data_1.assign(**{
    pollutant: filtered_data_1['AQI']
    .where(filtered_data_1['Defining Parameter'] == pollutant)  # NaN where another pollutant defines the AQI
    .fillna(0)
    .astype(str)
    for pollutant in ['CO', 'NO2', 'Ozone', 'PM10', 'PM2.5']
}).drop(columns=['Unnamed: 0', 'Defining Parameter', 'timezone'])
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['CO'] = np.where((filtered_data_1['Defining Parameter'] == 'CO'), filtered_data_1['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['NO2'] = np.where((filtered_data_1['Defining Parameter'] == 'NO2'), filtered_data_1['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['Ozone'] = np.where((filtered_data_1['Defining Parameter'] == 'Ozone'), filtered_data_1['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['PM10'] = np.where((filtered_data_1['Defining Parameter'] == 'PM10'), filtered_data_1['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['PM2.5'] = np.where((filtered_data_1['Defining Parameter'] == 'PM2.5'), filtered_data_1['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['CO'] = filtered_data_1['CO'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['NO2'] = filtered_data_1['NO2'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['Ozone'] = filtered_data_1['Ozone'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['PM10'] = filtered_data_1['PM10'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['PM2.5'] = filtered_data_1['PM2.5'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['CO'] = filtered_data_1['CO'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['NO2'] = filtered_data_1['NO2'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['Ozone'] = filtered_data_1['Ozone'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['PM10'] = filtered_data_1['PM10'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:20: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1['PM2.5'] = filtered_data_1['PM2.5'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1.drop(['Unnamed: 0'], axis=1, inplace=True) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:24: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1.drop(['Defining Parameter'], axis=1, inplace=True) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_1.drop(['timezone'], axis=1, inplace=True)
# Show the data from May 2020 to May 2021 (a bare expression at the end of a notebook cell is rendered as a table)
filtered_data_1
| CBSA Code | Date | AQI | Category | Number of Sites Reporting | city_ascii | state_id | state_name | lat | lng | population | density | CO | NO2 | Ozone | PM10 | PM2.5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25831 | 10100 | 2021-01-01 | 43 | Good | 1 | Aberdeen | SD | South Dakota | 45.4649 | -98.4686 | 28315.0 | 661.0 | 0.0 | 0.0 | 0.0 | 0.0 | 43.0 |
| 25832 | 10100 | 2021-01-02 | 35 | Good | 1 | Aberdeen | SD | South Dakota | 45.4649 | -98.4686 | 28315.0 | 661.0 | 0.0 | 0.0 | 0.0 | 0.0 | 35.0 |
| 25833 | 10100 | 2021-01-03 | 55 | Moderate | 1 | Aberdeen | SD | South Dakota | 45.4649 | -98.4686 | 28315.0 | 661.0 | 0.0 | 0.0 | 0.0 | 0.0 | 55.0 |
| 25834 | 10100 | 2021-01-04 | 7 | Good | 1 | Aberdeen | SD | South Dakota | 45.4649 | -98.4686 | 28315.0 | 661.0 | 0.0 | 0.0 | 0.0 | 0.0 | 7.0 |
| 25835 | 10100 | 2021-01-05 | 12 | Good | 1 | Aberdeen | SD | South Dakota | 45.4649 | -98.4686 | 28315.0 | 661.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 359311 | 49740 | 2020-12-27 | 32 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 32.0 | 0.0 | 0.0 |
| 359312 | 49740 | 2020-12-28 | 44 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 0.0 | 44.0 | 0.0 |
| 359313 | 49740 | 2020-12-29 | 38 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 38.0 | 0.0 | 0.0 |
| 359314 | 49740 | 2020-12-30 | 36 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 36.0 | 0.0 | 0.0 |
| 359315 | 49740 | 2020-12-31 | 35 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 0.0 | 0.0 | 35.0 |
179923 rows × 17 columns
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of
# filtered_data_1, rendered with a 'plasma' background gradient
filtered_data_1.describe().style.background_gradient(cmap = "plasma")
| CBSA Code | AQI | Number of Sites Reporting | lat | lng | population | density | |
|---|---|---|---|---|---|---|---|
| count | 179923.000000 | 179923.000000 | 179923.000000 | 179923.000000 | 179923.000000 | 179923.000000 | 179923.000000 |
| mean | 29966.728267 | 41.264936 | 3.384398 | 38.688751 | -95.333500 | 454618.016768 | 999.460708 |
| std | 11357.713469 | 24.531330 | 4.928993 | 5.638516 | 16.910286 | 1372391.444181 | 919.182145 |
| min | 10100.000000 | 0.000000 | 1.000000 | 19.688300 | -159.352100 | 1903.000000 | 4.000000 |
| 25% | 19980.000000 | 29.000000 | 1.000000 | 34.933300 | -109.220900 | 24479.000000 | 509.000000 |
| 50% | 29620.000000 | 38.000000 | 2.000000 | 39.465300 | -90.241200 | 98775.000000 | 777.000000 |
| 75% | 40140.000000 | 48.000000 | 3.000000 | 42.444200 | -82.151100 | 291388.000000 | 1183.000000 |
| max | 49740.000000 | 1250.000000 | 45.000000 | 64.835300 | -68.790600 | 18680025.000000 | 10768.000000 |
# Overview of filtered_data_1: index range, column names, non-null counts, dtypes and memory usage
filtered_data_1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 179923 entries, 25831 to 359315 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CBSA Code 179923 non-null int64 1 Date 179923 non-null object 2 AQI 179923 non-null int64 3 Category 179923 non-null object 4 Number of Sites Reporting 179923 non-null int64 5 city_ascii 179923 non-null object 6 state_id 179923 non-null object 7 state_name 179923 non-null object 8 lat 179923 non-null float64 9 lng 179923 non-null float64 10 population 179923 non-null float64 11 density 179923 non-null float64 12 CO 179923 non-null object 13 NO2 179923 non-null object 14 Ozone 179923 non-null object 15 PM10 179923 non-null object 16 PM2.5 179923 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 24.7+ MB
# Shape of filtered_data_1 as (number of rows, number of columns)
filtered_data_1.shape
(179923, 17)
# Prepare filtered_data_2 (June 2021 - May 2022) for analysis.
# Every step returns a new DataFrame (no column assignment or inplace drop on a
# slice), so this cell does not raise SettingWithCopyWarning.

# Remove 'District of Columbia' and 'Puerto Rico' rows. Done first: the columns
# built below are computed row by row, so the final result is unaffected.
filtered_data_2 = filtered_data_2[~filtered_data_2['state_name'].isin(['District of Columbia', 'Puerto Rico'])]

# Add one column per pollutant: it holds the row's AQI when that pollutant is the
# row's 'Defining Parameter' and 0 otherwise. The values are then stored as
# strings (e.g. '21.0', '0.0'), exactly as before.
# Finally drop the columns that are not needed any more.
filtered_data_2 = filtered_data_2.assign(**{
    pollutant: filtered_data_2['AQI']
    .where(filtered_data_2['Defining Parameter'] == pollutant)  # NaN where another pollutant defines the AQI
    .fillna(0)
    .astype(str)
    for pollutant in ['CO', 'NO2', 'Ozone', 'PM10', 'PM2.5']
}).drop(columns=['Unnamed: 0', 'Defining Parameter', 'timezone'])
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['CO'] = np.where((filtered_data_2['Defining Parameter'] == 'CO'), filtered_data_2['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['NO2'] = np.where((filtered_data_2['Defining Parameter'] == 'NO2'), filtered_data_2['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['Ozone'] = np.where((filtered_data_2['Defining Parameter'] == 'Ozone'), filtered_data_2['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['PM10'] = np.where((filtered_data_2['Defining Parameter'] == 'PM10'), filtered_data_2['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['PM2.5'] = np.where((filtered_data_2['Defining Parameter'] == 'PM2.5'), filtered_data_2['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['CO'] = filtered_data_2['CO'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['NO2'] = filtered_data_2['NO2'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['Ozone'] = filtered_data_2['Ozone'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['PM10'] = filtered_data_2['PM10'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['PM2.5'] = filtered_data_2['PM2.5'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['CO'] = filtered_data_2['CO'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['NO2'] = filtered_data_2['NO2'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['Ozone'] = filtered_data_2['Ozone'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['PM10'] = filtered_data_2['PM10'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:20: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2['PM2.5'] = filtered_data_2['PM2.5'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2.drop(['Unnamed: 0'], axis=1, inplace=True) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:24: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2.drop(['Defining Parameter'], axis=1, inplace=True) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:25: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy filtered_data_2.drop(['timezone'], axis=1, inplace=True)
# Show the data from June 2021 to May 2022 (a bare expression at the end of a notebook cell is rendered as a table)
filtered_data_2
| CBSA Code | Date | AQI | Category | Number of Sites Reporting | city_ascii | state_id | state_name | lat | lng | population | density | CO | NO2 | Ozone | PM10 | PM2.5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10140 | 2022-01-01 | 21 | Good | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 21.0 |
| 1 | 10140 | 2022-01-02 | 12 | Good | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 |
| 2 | 10140 | 2022-01-03 | 18 | Good | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 18.0 |
| 3 | 10140 | 2022-01-04 | 19 | Good | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 19.0 |
| 4 | 10140 | 2022-01-05 | 17 | Good | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 17.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 192459 | 49740 | 2021-12-27 | 34 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 0.0 | 0.0 | 34.0 |
| 192460 | 49740 | 2021-12-28 | 34 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 34.0 | 0.0 | 0.0 |
| 192461 | 49740 | 2021-12-29 | 35 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 35.0 | 0.0 | 0.0 |
| 192462 | 49740 | 2021-12-30 | 27 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 27.0 | 0.0 | 0.0 |
| 192463 | 49740 | 2021-12-31 | 27 | Good | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 27.0 | 0.0 | 0.0 |
122445 rows × 17 columns
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of
# filtered_data_2, rendered with a 'plasma' background gradient
filtered_data_2.describe().style.background_gradient(cmap = "plasma")
| CBSA Code | AQI | Number of Sites Reporting | lat | lng | population | density | |
|---|---|---|---|---|---|---|---|
| count | 122445.000000 | 122445.000000 | 122445.000000 | 122445.000000 | 122445.000000 | 122445.000000 | 122445.000000 |
| mean | 29933.876189 | 42.221438 | 3.199779 | 38.487797 | -95.242031 | 483641.387986 | 1002.417240 |
| std | 11381.454986 | 23.981390 | 4.611150 | 5.560619 | 16.597819 | 1428792.594744 | 946.266079 |
| min | 10100.000000 | 0.000000 | 1.000000 | 19.688300 | -159.352100 | 1903.000000 | 4.000000 |
| 25% | 19780.000000 | 29.000000 | 1.000000 | 34.617500 | -108.567300 | 25290.000000 | 508.000000 |
| 50% | 29540.000000 | 38.000000 | 2.000000 | 39.305100 | -90.468100 | 107460.000000 | 776.000000 |
| 75% | 40140.000000 | 50.000000 | 3.000000 | 42.270500 | -82.197700 | 316743.000000 | 1188.000000 |
| max | 49740.000000 | 775.000000 | 44.000000 | 64.835300 | -68.790600 | 18680025.000000 | 10768.000000 |
# Overview of filtered_data_2: index range, column names, non-null counts, dtypes and memory usage
filtered_data_2.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 122445 entries, 0 to 192463 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CBSA Code 122445 non-null int64 1 Date 122445 non-null object 2 AQI 122445 non-null int64 3 Category 122445 non-null object 4 Number of Sites Reporting 122445 non-null int64 5 city_ascii 122445 non-null object 6 state_id 122445 non-null object 7 state_name 122445 non-null object 8 lat 122445 non-null float64 9 lng 122445 non-null float64 10 population 122445 non-null float64 11 density 122445 non-null float64 12 CO 122445 non-null object 13 NO2 122445 non-null object 14 Ozone 122445 non-null object 15 PM10 122445 non-null object 16 PM2.5 122445 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 16.8+ MB
# Shape of filtered_data_2 as (number of rows, number of columns)
filtered_data_2.shape
(122445, 17)
# Prepare full_data (May 2020 - May 2022) for analysis.
# Every step returns a new DataFrame (no column assignment or inplace drop on a
# slice), so this cell does not raise SettingWithCopyWarning.

# Remove 'District of Columbia' and 'Puerto Rico' rows. Done first: the columns
# built below are computed row by row, so the final result is unaffected.
full_data = full_data[~full_data['state_name'].isin(['District of Columbia', 'Puerto Rico'])]

# Add one column per pollutant: it holds the row's AQI when that pollutant is the
# row's 'Defining Parameter' and 0 otherwise. The values are then stored as
# strings (e.g. '43.0', '0.0'), exactly as before.
# Unlike the two period-specific frames, 'Defining Parameter' is kept here;
# only 'Unnamed: 0' and 'timezone' are dropped.
full_data = full_data.assign(**{
    pollutant: full_data['AQI']
    .where(full_data['Defining Parameter'] == pollutant)  # NaN where another pollutant defines the AQI
    .fillna(0)
    .astype(str)
    for pollutant in ['CO', 'NO2', 'Ozone', 'PM10', 'PM2.5']
}).drop(columns=['Unnamed: 0', 'timezone'])
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['CO'] = np.where((full_data['Defining Parameter'] == 'CO'), full_data['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['NO2'] = np.where((full_data['Defining Parameter'] == 'NO2'), full_data['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['Ozone'] = np.where((full_data['Defining Parameter'] == 'Ozone'), full_data['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['PM10'] = np.where((full_data['Defining Parameter'] == 'PM10'), full_data['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['PM2.5'] = np.where((full_data['Defining Parameter'] == 'PM2.5'), full_data['AQI'], np.nan) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['CO'] = full_data['CO'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['NO2'] = full_data['NO2'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['Ozone'] = full_data['Ozone'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['PM10'] = full_data['PM10'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['PM2.5'] = full_data['PM2.5'].fillna(0) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['CO'] = full_data['CO'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['NO2'] = full_data['NO2'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['Ozone'] = full_data['Ozone'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['PM10'] = full_data['PM10'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:20: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. 
Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data['PM2.5'] = full_data['PM2.5'].astype(str) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data.drop(['Unnamed: 0'], axis=1, inplace=True) C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:24: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy full_data.drop(['timezone'], axis=1, inplace=True)
# Show data from May 2020 to May 2022
full_data
| CBSA Code | Date | AQI | Category | Defining Parameter | Number of Sites Reporting | city_ascii | state_id | state_name | lat | lng | population | density | CO | NO2 | Ozone | PM10 | PM2.5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10140 | 2022-01-01 | 21 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 21.0 |
| 1 | 10140 | 2022-01-02 | 12 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 12.0 |
| 2 | 10140 | 2022-01-03 | 18 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 18.0 |
| 3 | 10140 | 2022-01-04 | 19 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 19.0 |
| 4 | 10140 | 2022-01-05 | 17 | Good | PM2.5 | 2 | Aberdeen | WA | Washington | 46.9757 | -123.8094 | 16571.0 | 588.0 | 0.0 | 0.0 | 0.0 | 0.0 | 17.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 359311 | 49740 | 2020-12-27 | 32 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 32.0 | 0.0 | 0.0 |
| 359312 | 49740 | 2020-12-28 | 44 | Good | PM10 | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 0.0 | 44.0 | 0.0 |
| 359313 | 49740 | 2020-12-29 | 38 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 38.0 | 0.0 | 0.0 |
| 359314 | 49740 | 2020-12-30 | 36 | Good | Ozone | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 36.0 | 0.0 | 0.0 |
| 359315 | 49740 | 2020-12-31 | 35 | Good | PM2.5 | 1 | Yuma | AZ | Arizona | 32.5995 | -114.5491 | 137612.0 | 311.0 | 0.0 | 0.0 | 0.0 | 0.0 | 35.0 |
302368 rows × 18 columns
# Summary Statistics of full_data
full_data.describe().style.background_gradient(cmap = "plasma")
| CBSA Code | AQI | Number of Sites Reporting | lat | lng | population | density | |
|---|---|---|---|---|---|---|---|
| count | 302368.000000 | 302368.000000 | 302368.000000 | 302368.000000 | 302368.000000 | 302368.000000 | 302368.000000 |
| mean | 29953.424701 | 41.652275 | 3.309636 | 38.607374 | -95.296459 | 466371.134455 | 1000.657966 |
| std | 11367.326282 | 24.314623 | 4.803664 | 5.607960 | 16.784485 | 1395576.343727 | 930.244469 |
| min | 10100.000000 | 0.000000 | 1.000000 | 19.688300 | -159.352100 | 1903.000000 | 4.000000 |
| 25% | 19820.000000 | 29.000000 | 1.000000 | 34.769000 | -109.220900 | 24834.000000 | 508.000000 |
| 50% | 29620.000000 | 38.000000 | 2.000000 | 39.379700 | -90.245100 | 101326.000000 | 776.000000 |
| 75% | 40140.000000 | 49.000000 | 3.000000 | 42.337200 | -82.151100 | 300268.000000 | 1183.000000 |
| max | 49740.000000 | 1250.000000 | 45.000000 | 64.835300 | -68.790600 | 18680025.000000 | 10768.000000 |
# Information Overview of full_data
full_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 302368 entries, 0 to 359315 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CBSA Code 302368 non-null int64 1 Date 302368 non-null object 2 AQI 302368 non-null int64 3 Category 302368 non-null object 4 Defining Parameter 302368 non-null object 5 Number of Sites Reporting 302368 non-null int64 6 city_ascii 302368 non-null object 7 state_id 302368 non-null object 8 state_name 302368 non-null object 9 lat 302368 non-null float64 10 lng 302368 non-null float64 11 population 302368 non-null float64 12 density 302368 non-null float64 13 CO 302368 non-null object 14 NO2 302368 non-null object 15 Ozone 302368 non-null object 16 PM10 302368 non-null object 17 PM2.5 302368 non-null object dtypes: float64(4), int64(3), object(11) memory usage: 43.8+ MB
# Shape of full_data
full_data.shape
(302368, 18)
# Show the count of missing values in each column
full_data.isnull().sum()
CBSA Code 0 Date 0 AQI 0 Category 0 Defining Parameter 0 Number of Sites Reporting 0 city_ascii 0 state_id 0 state_name 0 lat 0 lng 0 population 0 density 0 CO 0 NO2 0 Ozone 0 PM10 0 PM2.5 0 dtype: int64
limit1 = 500  # AQI values above this threshold are reported as outliers

# Columns of full_data used in the report
date_column1 = full_data['Date']
sn_column1 = full_data['state_name']
aqi_column1 = full_data['AQI']

# Boolean mask of the rows whose AQI exceeds the threshold
above_limit1 = aqi_column1 > limit1

# Dates, state names and AQI values of the rows above the threshold
aqi_values1 = aqi_column1[above_limit1]
aqi_date1 = date_column1[above_limit1]
aqi_state_name1 = sn_column1[above_limit1]

# Number of AQI values above the threshold
count_values1 = aqi_values1.count()
print('Total Number of AQI Values>500 from May 2020 to May 2022:', count_values1)
print('--------------------------------------------------------------')

# Collect the outlier rows in a DataFrame and print it without the index
outlier_data = {'Date': aqi_date1, 'State Name': aqi_state_name1, 'AQI': aqi_values1}
outlier_df = pd.DataFrame(outlier_data)
print(outlier_df.to_string(index=False))
Total Number of AQI Values>500 from May 2020 to May 2022: 33
--------------------------------------------------------------
Date State Name AQI
2022-02-23 New Mexico 775
2022-05-29 California 733
2022-04-11 California 644
2022-05-29 California 631
2021-10-11 California 552
2021-10-11 California 666
2021-03-16 New Mexico 665
2021-09-14 California 537
2020-09-08 Oregon 502
2020-09-11 Oregon 583
2020-09-12 Oregon 506
2020-09-08 California 684
2020-09-10 Oregon 510
2020-09-12 Oregon 550
2020-09-13 Oregon 502
2020-09-08 California 543
2020-09-07 Washington 908
2020-09-12 Oregon 679
2020-09-12 Oregon 517
2020-08-16 Arizona 1124
2020-11-07 Arizona 520
2020-09-13 Oregon 509
2020-09-12 Oregon 561
2020-06-05 California 576
2020-05-19 Wyoming 1250
2020-09-13 Oregon 518
2020-09-09 Oregon 550
2020-09-11 Oregon 641
2020-09-12 Oregon 526
2020-09-13 Oregon 548
2020-09-13 Washington 536
2020-09-07 Washington 819
2020-09-12 Washington 519
col = 'Date'  # column holding the observation date as 'YYYY-MM-DD' text

# Parse the Date column of both filtered data sets into datetime values
for frame in (filtered_data_1, filtered_data_2):
    frame[col] = pd.to_datetime(frame[col], format='%Y-%m-%d')
# AQI categories present in full_data
aqi_categories = full_data['Category'].unique()
print("Categories of Air Quality:\n", aqi_categories)
print()

# Alphabetically sorted names of the states present in full_data
states_name = sorted(full_data['state_name'].unique())
print("Name of the U.S. State:\n", states_name)
print()

# Number of distinct states
num_of_us_states = len(states_name)
print("Total Number of the U.S. State:", num_of_us_states)
Categories of Air Quality: ['Good' 'Moderate' 'Unhealthy for Sensitive Groups' 'Unhealthy' 'Hazardous' 'Very Unhealthy'] Name of the U.S. State: ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'] Total Number of the U.S. State: 50
# Both charts plot AQI against Date with one line per state
line_chart_options = dict(x="Date", y="AQI", color="state_name")

# Line graph of Air quality in the US from May 2020 to May 2021
fig1 = px.line(
    filtered_data_1,
    title='Line graph of Air quality in the US from May 2020 to May 2021',
    **line_chart_options,
)
fig1.show()

# Line graph of Air quality in the US from June 2021 to May 2022
fig2 = px.line(
    filtered_data_2,
    title='Line graph of Air quality in the US from June 2021 to May 2022',
    **line_chart_options,
)
fig2.show()
# Air Quality Index (AQI) Categories - Color-Coded Rectangles
# Each rectangle carries the AQI range (y, height), colour and transparency of
# one category. Only those properties are used for drawing below; the fixed
# x-extent (-1_000 .. 16_500) is left unchanged in case other cells reuse them.
GOOD = plt.Rectangle((-1_000, 0), 17_500, 50, fc='green', ec="green", alpha=0.25)
MODERATE = plt.Rectangle((-1_000, 50), 17_500, 50, fc='yellow', ec="yellow", alpha=0.25)
UNHEALTHY_FOR_SENSITIVE = plt.Rectangle((-1_000, 100), 17_500, 50, fc='orange', ec="orange", alpha=0.25)
UNHEALTHY = plt.Rectangle((-1_000, 150), 17_500, 50, fc='red', ec="red", alpha=0.25)
VERY_UNHEALTHY = plt.Rectangle((-1_000, 200), 17_500, 100, fc='purple', ec="purple", alpha=0.25)
HAZARDOUS = plt.Rectangle((-1_000, 300), 17_500, 200, fc='maroon', ec="maroon", alpha=0.25)


def _plot_us_aqi_time_series(daily_mean_aqi, title):
    """Plot a mean-AQI series over colour-coded AQI category bands.

    Parameters
    ----------
    daily_mean_aqi : pandas.Series
        Mean AQI per date, as produced by ``groupby('Date').AQI.mean()``.
    title : str
        Title shown above the plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    ax = daily_mean_aqi.plot(figsize=(12, 6), color='black', alpha=0.75)
    plt.title(title, {'size': 25, 'weight': 'bold'})

    # Adds color bands. axhspan stretches across the whole axes whatever the
    # x units are, whereas the rectangles' fixed x-extent does not line up with
    # a datetime x-axis (2020-2022 lies beyond it), which would hide the bands.
    category_bands = (GOOD, MODERATE, UNHEALTHY_FOR_SENSITIVE,
                      UNHEALTHY, VERY_UNHEALTHY, HAZARDOUS)
    for band in category_bands:
        lower = band.get_y()
        ax.axhspan(
            lower,
            lower + band.get_height(),
            facecolor=band.get_facecolor(),
            edgecolor=band.get_edgecolor(),
            alpha=band.get_alpha(),
        )

    # x and y axis labeling
    plt.xlabel('Date', {'size': 14, 'weight': 'bold'})
    plt.ylabel('AQI', {'size': 14, 'weight': 'bold'})

    # Set limit of y axis
    plt.ylim(0, 250)

    # Sets up legend
    legend_elements = [
        Patch(facecolor='green', edgecolor='black', alpha=0.5, label='Good (0-50)'),
        Patch(facecolor='yellow', edgecolor='black', alpha=0.5, label='Moderate (51-100)'),
        Patch(facecolor='orange', edgecolor='black', alpha=0.5, label='Unhealthy for Sensitive Groups (101-150)'),
        Patch(facecolor='red', edgecolor='black', alpha=0.5, label='Unhealthy (151-200)'),
        Patch(facecolor='purple', edgecolor='black', alpha=0.5, label='Very Unhealthy (201-300)'),
        Patch(facecolor='maroon', edgecolor='black', alpha=0.5, label='Hazardous (301-500)'),
    ]
    legend = plt.legend(handles=legend_elements)
    legend.set_title("AQI Category", prop={'size': 12, 'weight': 'bold'})

    # displays the plot
    plt.show()


# plots time series of US average AQI (May 2020 - May 2021)
df1 = filtered_data_1.groupby('Date').AQI.mean().loc[:'2021-05-31']
_plot_us_aqi_time_series(df1, 'Time Series Plot of US AQI (May 2020 - May 2021)')

# plots time series of US average AQI (June 2021 - May 2022)
df2 = filtered_data_2.groupby('Date').AQI.mean().loc[:'2022-05-31']
_plot_us_aqi_time_series(df2, 'Time Series Plot of US AQI (June 2021 - May 2022)')
sns.set(rc={"figure.figsize": (12, 6)})  # graph size
plt.style.use("fivethirtyeight")
# bar plot of overall good aqi values (0-50) in US from May 2020-May 2021
# countplot draws one bar per category, so x positions are bar indices rather
# than AQI values; `order` pins the bars to exactly AQI 0..50 (missing values
# show as empty slots) instead of cropping bar positions with xlim.
plot = sns.countplot(data=filtered_data_1, x="AQI", order=list(range(0, 51)), color="#00DB16")
plt.title("Overall Count of Good AQI Values (AQI 0-50) in the USA from May 2020-May 2021")
plot.tick_params(axis="x", labelsize=9)
# No plt.figure() here: calling it after plotting only opened a second, empty figure.
plt.show()
<Figure size 1000x1200 with 0 Axes>
# bar plot of overall good aqi values (0-50) in US from June 2021-May 2022
# countplot draws one bar per category, so x positions are bar indices rather
# than AQI values; `order` pins the bars to exactly AQI 0..50 (missing values
# show as empty slots) instead of cropping bar positions with xlim.
plot = sns.countplot(data=filtered_data_2, x="AQI", order=list(range(0, 51)), color="#00DB16")
plt.title("Overall Count of Good AQI Values (AQI 0-50) in the USA from June 2021-May 2022")
plot.tick_params(axis="x", labelsize=9)
# No plt.figure() here: calling it after plotting only opened a second, empty figure.
plt.show()
<Figure size 1000x1200 with 0 Axes>
# bar plot of overall moderate aqi values (50-100) in US from May 2020-May 2021
# countplot draws one bar per category, so x positions are bar indices rather
# than AQI values; `order` pins the bars to exactly AQI 50..100 (missing values
# show as empty slots) instead of cropping bar positions with xlim.
plot = sns.countplot(data=filtered_data_1, x="AQI", order=list(range(50, 101)), color="#F3FC00")
plt.title("Overall Count of Moderate AQI Values (AQI 50-100) in the USA from May 2020-May 2021")
plot.tick_params(axis="x", labelsize=9)
# No plt.figure() here: calling it after plotting only opened a second, empty figure.
plt.show()
<Figure size 1000x1200 with 0 Axes>
# bar plot of overall moderate aqi values (50-100) in US from June 2021-May 2022
# countplot draws one bar per category, so x positions are bar indices rather
# than AQI values; `order` pins the bars to exactly AQI 50..100 (missing values
# show as empty slots) instead of cropping bar positions with xlim.
plot = sns.countplot(data=filtered_data_2, x="AQI", order=list(range(50, 101)), color="#F3FC00")
plt.title("Overall Count of Moderate AQI Values (AQI 50-100) in the USA from June 2021-May 2022")
plot.tick_params(axis="x", labelsize=9)
# No plt.figure() here: calling it after plotting only opened a second, empty figure.
plt.show()
<Figure size 1000x1200 with 0 Axes>
# bar plot of overall unhealthy-for-sensitive-groups aqi values (100-150) in US from May 2020-May 2021
# countplot draws one bar per category, so x positions are bar indices rather
# than AQI values; `order` pins the bars to exactly AQI 100..150 (missing values
# show as empty slots) instead of cropping bar positions with xlim.
plot = sns.countplot(data=filtered_data_1, x="AQI", order=list(range(100, 151)), color="#FFA200")
plt.title("Overall Count of Unhealthy for Sensitive Groups AQI Values (AQI 100-150) in the USA from May 2020-May 2021")
plot.tick_params(axis="x", labelsize=9)
# No plt.figure() here: calling it after plotting only opened a second, empty figure.
plt.show()
<Figure size 1000x1200 with 0 Axes>
# bar plot of overall unhealthy-for-sensitive-groups aqi values (100-150) in US from June 2021-May 2022
# countplot draws one bar per category, so x positions are bar indices rather
# than AQI values; `order` pins the bars to exactly AQI 100..150 (missing values
# show as empty slots) instead of cropping bar positions with xlim.
plot = sns.countplot(data=filtered_data_2, x="AQI", order=list(range(100, 151)), color="#FFA200")
plt.title("Overall Count of Unhealthy for Sensitive Groups AQI Values (AQI 100-150) in the USA from June 2021-May 2022")
plot.tick_params(axis="x", labelsize=9)
# No plt.figure() here: calling it after plotting only opened a second, empty figure.
plt.show()
<Figure size 1000x1200 with 0 Axes>
# Names of the 50 U.S. states to chart, in alphabetical order
state_names_to_plot = sorted([
    'Washington', 'Ohio', 'Georgia', 'Oregon', 'New York', 'New Mexico', 'Texas',
    'Michigan', 'Wisconsin', 'Oklahoma', 'Arkansas', 'Maine', 'California',
    'Maryland', 'Louisiana', 'Nebraska', 'Montana', 'North Dakota', 'Virginia',
    'Illinois', 'Indiana', 'Massachusetts', 'Colorado', 'Kentucky', 'Florida',
    'Utah', 'Iowa', 'West Virginia', 'South Carolina', 'North Carolina',
    'Tennessee', 'New Hampshire', 'Mississippi', 'Missouri', 'Alabama',
    'Minnesota', 'Arizona', 'Pennsylvania', 'Wyoming', 'Hawaii',
    'Idaho', 'Nevada', 'Rhode Island', 'New Jersey',
    'South Dakota', 'Alaska', 'Connecticut', 'Vermont', 'Kansas', 'Delaware',
])

# One chart per AQI category: (title template, bar color, lowest AQI, highest AQI)
state_count_charts = [
    ("Count of Good AQI Values (AQI 0-50) in {state}", "#00DB16", 0, 50),
    ("Count of Moderate AQI Values (AQI 50-100) in {state}", "#F3FC00", 50, 100),
    ("Unhealthy for Sensitive Groups AQI Values (AQI 100-150) in {state}", "#FFA200", 100, 150),
]


def _plot_state_aqi_counts(period_data, state):
    """Draw the good / moderate / unhealthy-for-sensitive-groups AQI count charts for one state.

    Parameters
    ----------
    period_data : pandas.DataFrame
        Observations of one period; must have 'state_name' and 'AQI' columns.
    state : str
        Value of 'state_name' to chart.

    Returns
    -------
    None
        Each chart is displayed with ``plt.show()``.
    """
    # Filter once; all three charts use the same state subset
    state_data = period_data[period_data["state_name"] == state]
    for title_template, bar_color, lowest_aqi, highest_aqi in state_count_charts:
        # countplot draws one bar per category, so x positions are bar indices,
        # not AQI values. A state rarely has every AQI value, so cropping with
        # xlim would show the wrong values; `order` pins the bars to the range.
        plot = sns.countplot(
            data=state_data,
            x="AQI",
            order=list(range(lowest_aqi, highest_aqi + 1)),
            color=bar_color,
        )
        plt.title(title_template.format(state=state))
        plot.tick_params(axis="x", labelsize=9)
        plt.show()


# bar plots per state from May 2020-May 2021
for state in state_names_to_plot:
    _plot_state_aqi_counts(filtered_data_1, state)

# bar plots per state from June 2021-May 2022
for state in state_names_to_plot:
    _plot_state_aqi_counts(filtered_data_2, state)
# Work on an independent copy so the assignments below do not raise
# SettingWithCopyWarning when full_data is a slice of another DataFrame.
full_data = full_data.copy()

# convert date to datetime of full_data
full_data['Date'] = pd.to_datetime(full_data['Date'])

# extract year, month and day from date of full_data and place them at
# column positions 2, 3 and 4
date_parts = {
    'Year': full_data['Date'].dt.year,
    'Month': full_data['Date'].dt.month,
    'Day': full_data['Date'].dt.day,
}
for position, (part_name, part_values) in enumerate(date_parts.items(), start=2):
    if part_name in full_data.columns:
        # DataFrame.insert raises if the column exists; refresh it instead so
        # the cell can be re-run.
        full_data[part_name] = part_values
    else:
        full_data.insert(position, part_name, part_values)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\1187036773.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
full_data.Year.unique() # print unique years of full_data
array([2022, 2021, 2020], dtype=int64)
def labels(Title, Xlabel=None, Ylabel=None, Rotation=None):
    """
    Set the title, axis labels and x-tick rotation of the current matplotlib plot.

    Title    : text of the plot title (font size 22)
    Xlabel   : text of the x-axis label (font size 18)
    Ylabel   : text of the y-axis label (font size 18)
    Rotation : rotation passed to plt.xticks for the x-axis ticks

    Nothing is returned; the plot is modified in place.
    """
    axis_label_size = 18
    plt.title(Title, fontsize=22)
    for set_axis_label, label_text in ((plt.xlabel, Xlabel), (plt.ylabel, Ylabel)):
        set_axis_label(label_text, fontsize=axis_label_size)
    plt.xticks(rotation=Rotation)
# count plot of the number of rows recorded in each year
plt.style.use('dark_background')
sns.countplot(x=full_data.Year, palette='flare')
labels('Number of Values Along Each Year', 'Year', 'No. of Testing', 45)

# total number of values of year 2020, 2021 and 2022
rows_per_year = full_data['Year'].value_counts()
year_2020_total = int(rows_per_year.get(2020, 0))
year_2021_total = int(rows_per_year.get(2021, 0))
year_2022_total = int(rows_per_year.get(2022, 0))

for year, year_total in ((2020, year_2020_total), (2021, year_2021_total), (2022, year_2022_total)):
    print(f"Total values for {year}: {year_total}")
Total values for 2020: 111610 Total values for 2021: 165294 Total values for 2022: 25464
# value counts of aqi category
full_data['Category'].value_counts()
Good 234713 Moderate 61009 Unhealthy for Sensitive Groups 4505 Unhealthy 1643 Very Unhealthy 348 Hazardous 150 Name: Category, dtype: int64
# bar plot showing percentage of defining parameter (CO, NO2, Ozone, PM2.5, PM10)
pmf = Pmf.from_seq(full_data['Defining Parameter'])
plt.style.use('ggplot')
# Pmf.from_seq normalizes by default, so pmf.values are fractions of the rows;
# scale by 100 so the bar heights match the 'Percentage' axis label.
plt.bar(pmf.index, pmf.values * 100)
labels('Percentage of Defining Parameter', 'Parameter', 'Percentage')
# X and y are the input and target variables.
# The pollutant columns are stored as strings, so convert them to floats
# explicitly instead of relying on implicit coercion inside scikit-learn.
# NOTE(review): each feature equals AQI on the rows where that pollutant is the
# defining parameter (0 otherwise), so the features leak the target - confirm
# this is intended before interpreting the error metrics.
X = full_data[['Ozone', 'PM10', 'PM2.5']].astype(float)
y = full_data['AQI']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
linear_reg_pred = linear_reg.predict(X_test)

# The tree-based models below are seeded (same seed as the split) so that the
# reported metrics are reproducible from run to run.

# Train a Decision Tree Regressor
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)
tree_reg_pred = tree_reg.predict(X_test)

# Train a Random Forest Regressor
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train, y_train)
forest_reg_pred = forest_reg.predict(X_test)

# Train a Gradient Boosting Regressor
gb_reg = GradientBoostingRegressor(random_state=42)
gb_reg.fit(X_train, y_train)
gb_reg_pred = gb_reg.predict(X_test)
# Function to calculate regression error metrics
def calculate_metrics(y_true, y_pred):
    """
    Compute error metrics between true and predicted values.

    y_true : true target values
    y_pred : predicted target values

    Returns a tuple (rmse, mse, mae): root mean squared error,
    mean squared error and mean absolute error.
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the MSE computed above
    rmse = np.sqrt(mse)
    return rmse, mse, mae
# Test-set predictions of each fitted model, keyed by the model's display name
models = {
    'Linear Regression': linear_reg_pred,
    'Decision Tree Regressor': tree_reg_pred,
    'Random Forest Regressor': forest_reg_pred,
    'Gradient Boosting Regressor': gb_reg_pred,
}

# Print RMSE, MSE and MAE (mean absolute error) for every model
for model_name, test_predictions in models.items():
    rmse, mse, mae = calculate_metrics(y_test, test_predictions)
    print(
        f"Model Evaluation Metrics for {model_name}:",
        f"RMSE: {rmse:.4f}",
        f"MSE: {mse:.4f}",
        f"MAE: {mae:.4f}",
        "---------------------",
        sep="\n",
    )
Model Evaluation Metrics for Linear Regression: RMSE: 2.8401 MSE: 8.0661 MAE: 0.5574 --------------------- Model Evaluation Metrics for Decision Tree Regressor: RMSE: 1.7845 MSE: 3.1843 MAE: 0.1469 --------------------- Model Evaluation Metrics for Random Forest Regressor: RMSE: 1.7800 MSE: 3.1683 MAE: 0.1464 --------------------- Model Evaluation Metrics for Gradient Boosting Regressor: RMSE: 1.9491 MSE: 3.7991 MAE: 0.4949 ---------------------